In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.colors as colors
import plotly.io as pio
In [2]:
#pio.renderers.default = 'png'  # Forces static images for Plotly outputs
In [3]:
df=pd.read_csv("sales_forecasting.csv", encoding="latin1")
In [4]:
df.head()
Out[4]:
Row ID Order ID Order Date Ship Date Ship Mode Customer ID Customer Name Segment Country City ... Postal Code Region Product ID Category Sub-Category Product Name Sales Quantity Discount Profit
0 1 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520 Claire Gute Consumer United States Henderson ... 42420 South FUR-BO-10001798 Furniture Bookcases Bush Somerset Collection Bookcase 261.9600 2 0.00 41.9136
1 2 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520 Claire Gute Consumer United States Henderson ... 42420 South FUR-CH-10000454 Furniture Chairs Hon Deluxe Fabric Upholstered Stacking Chairs,... 731.9400 3 0.00 219.5820
2 3 CA-2016-138688 6/12/2016 6/16/2016 Second Class DV-13045 Darrin Van Huff Corporate United States Los Angeles ... 90036 West OFF-LA-10000240 Office Supplies Labels Self-Adhesive Address Labels for Typewriters b... 14.6200 2 0.00 6.8714
3 4 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335 Sean O'Donnell Consumer United States Fort Lauderdale ... 33311 South FUR-TA-10000577 Furniture Tables Bretford CR4500 Series Slim Rectangular Table 957.5775 5 0.45 -383.0310
4 5 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335 Sean O'Donnell Consumer United States Fort Lauderdale ... 33311 South OFF-ST-10000760 Office Supplies Storage Eldon Fold 'N Roll Cart System 22.3680 2 0.20 2.5164

5 rows × 21 columns

In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9994 non-null   int64  
 1   Order ID       9994 non-null   object 
 2   Order Date     9994 non-null   object 
 3   Ship Date      9994 non-null   object 
 4   Ship Mode      9994 non-null   object 
 5   Customer ID    9994 non-null   object 
 6   Customer Name  9994 non-null   object 
 7   Segment        9994 non-null   object 
 8   Country        9994 non-null   object 
 9   City           9994 non-null   object 
 10  State          9994 non-null   object 
 11  Postal Code    9994 non-null   int64  
 12  Region         9994 non-null   object 
 13  Product ID     9994 non-null   object 
 14  Category       9994 non-null   object 
 15  Sub-Category   9994 non-null   object 
 16  Product Name   9994 non-null   object 
 17  Sales          9994 non-null   float64
 18  Quantity       9994 non-null   int64  
 19  Discount       9994 non-null   float64
 20  Profit         9994 non-null   float64
dtypes: float64(3), int64(3), object(15)
memory usage: 1.6+ MB
In [6]:
df.describe()
Out[6]:
Row ID Postal Code Sales Quantity Discount Profit
count 9994.000000 9994.000000 9994.000000 9994.000000 9994.000000 9994.000000
mean 4997.500000 55190.379428 229.858001 3.789574 0.156203 28.656896
std 2885.163629 32063.693350 623.245101 2.225110 0.206452 234.260108
min 1.000000 1040.000000 0.444000 1.000000 0.000000 -6599.978000
25% 2499.250000 23223.000000 17.280000 2.000000 0.000000 1.728750
50% 4997.500000 56430.500000 54.490000 3.000000 0.200000 8.666500
75% 7495.750000 90008.000000 209.940000 5.000000 0.200000 29.364000
max 9994.000000 99301.000000 22638.480000 14.000000 0.800000 8399.976000

covert the datatype of date columns¶

In [7]:
df['Ship Date']= pd.to_datetime(df['Ship Date'])
df['Order Date']= pd.to_datetime(df['Order Date'])
In [8]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Row ID         9994 non-null   int64         
 1   Order ID       9994 non-null   object        
 2   Order Date     9994 non-null   datetime64[ns]
 3   Ship Date      9994 non-null   datetime64[ns]
 4   Ship Mode      9994 non-null   object        
 5   Customer ID    9994 non-null   object        
 6   Customer Name  9994 non-null   object        
 7   Segment        9994 non-null   object        
 8   Country        9994 non-null   object        
 9   City           9994 non-null   object        
 10  State          9994 non-null   object        
 11  Postal Code    9994 non-null   int64         
 12  Region         9994 non-null   object        
 13  Product ID     9994 non-null   object        
 14  Category       9994 non-null   object        
 15  Sub-Category   9994 non-null   object        
 16  Product Name   9994 non-null   object        
 17  Sales          9994 non-null   float64       
 18  Quantity       9994 non-null   int64         
 19  Discount       9994 non-null   float64       
 20  Profit         9994 non-null   float64       
dtypes: datetime64[ns](2), float64(3), int64(3), object(13)
memory usage: 1.6+ MB

Create columns for easy analysis¶

In [9]:
df['Month of order']=df['Order Date'].dt.month
df['Year of order']=df['Order Date'].dt.year
df['Day of order']=df['Order Date'].dt.day_name()
In [10]:
df.head()
Out[10]:
Row ID Order ID Order Date Ship Date Ship Mode Customer ID Customer Name Segment Country City ... Category Sub-Category Product Name Sales Quantity Discount Profit Month of order Year of order Day of order
0 1 CA-2016-152156 2016-11-08 2016-11-11 Second Class CG-12520 Claire Gute Consumer United States Henderson ... Furniture Bookcases Bush Somerset Collection Bookcase 261.9600 2 0.00 41.9136 11 2016 Tuesday
1 2 CA-2016-152156 2016-11-08 2016-11-11 Second Class CG-12520 Claire Gute Consumer United States Henderson ... Furniture Chairs Hon Deluxe Fabric Upholstered Stacking Chairs,... 731.9400 3 0.00 219.5820 11 2016 Tuesday
2 3 CA-2016-138688 2016-06-12 2016-06-16 Second Class DV-13045 Darrin Van Huff Corporate United States Los Angeles ... Office Supplies Labels Self-Adhesive Address Labels for Typewriters b... 14.6200 2 0.00 6.8714 6 2016 Sunday
3 4 US-2015-108966 2015-10-11 2015-10-18 Standard Class SO-20335 Sean O'Donnell Consumer United States Fort Lauderdale ... Furniture Tables Bretford CR4500 Series Slim Rectangular Table 957.5775 5 0.45 -383.0310 10 2015 Sunday
4 5 US-2015-108966 2015-10-11 2015-10-18 Standard Class SO-20335 Sean O'Donnell Consumer United States Fort Lauderdale ... Office Supplies Storage Eldon Fold 'N Roll Cart System 22.3680 2 0.20 2.5164 10 2015 Sunday

5 rows × 24 columns

Q1 - Monthly Sales¶

In [11]:
monthly_sales=df.groupby('Month of order')['Sales'].sum().reset_index()
fig=px.line(monthly_sales,x="Month of order", y="Sales",title="Monthly Sales Analysis")
fig.show()

In the month of November, the sales are maximun and in month of February the sales are minimum.

Q2 - Sales by Category¶

In [12]:
category_sales=df.groupby('Category')['Sales'].sum().reset_index()
fig=px.pie(category_sales,values="Sales", names='Category',title="Sales Analysis by Category", hole=0.1)
fig.update_traces(textposition="inside", textinfo="percent+label")
fig.show()

Maximum sales is observed in the technology category.

Q3 - Sales by Sub-Category¶

In [13]:
subcategory_sales=df.groupby('Sub-Category')['Sales'].sum().reset_index()
fig=px.bar(subcategory_sales, x='Sub-Category',y="Sales", title="Sales Analysis by Sub Category")
#fig.update_traces(textposition="inside", textinfo="percent+label")
fig.show()

Maximum sales are observed in the chairs and phones while fasteners have minimum sales.

Q4 - Monthly Profit¶

In [14]:
monthly_sales=df.groupby('Month of order')['Profit'].sum().reset_index()
fig=px.line(monthly_sales,x="Month of order", y="Profit",title="Monthly Profit Analysis")
fig.show()

Maximum profit is in december while minimum profit is in january.

Q5 - Top 10 Products by Revenue¶

In [15]:
top_products = df.groupby('Product Name')['Sales'].sum().reset_index().sort_values(by='Sales', ascending=False).head(10)
fig = px.bar(top_products, x='Product Name', y='Sales', title='Top 10 Products by Revenue')
fig.show()

Q6 - Monthly Trend of Units Sold¶

In [16]:
monthly_units = df.groupby('Month of order')['Quantity'].sum().reset_index()
fig = px.line(monthly_units, x='Month of order', y='Quantity', title='Monthly Trend of Units Sold')
fig.show()

In month of November maximum quantity is sold.

Q7 - Year-over-Year Sales Growth¶

In [17]:
yearly_sales_growth = df.groupby('Year of order')['Sales'].sum().reset_index()
fig = px.bar(yearly_sales_growth, x="Year of order", y="Sales", title='Year-over-Year Sales Growth')
fig.update_layout(xaxis_type='category')
fig.show()

In the year of 2017 maximum sales are observed.

Q8 - Region-wise Contribution to Total Sales¶

In [18]:
region_contrib = df.groupby('Region')['Sales'].sum().reset_index()
region_contrib['Sales_Percentage'] = round(region_contrib['Sales'] / (region_contrib['Sales'].sum()) * 100, 2)
fig = px.pie(region_contrib, names='Region', values='Sales_Percentage', title='Region-wise Sales Contribution (%)')
fig.update_traces(textposition="inside", textinfo="percent+label")
fig.show()

In the west region the total sales percentage is maximum which is about 31.6%

Q9 - Profit by Category and Sub-Categor¶

In [19]:
p_c=df.groupby('Category')['Profit'].sum().reset_index()
fig=px.bar(p_c, x="Category",y="Profit", title="Profit Analysis by Category")
fig.show()

In the technology category the profit is maximum and in the Furniture category the profit is minimum.

In [20]:
p_sc=df.groupby('Sub-Category')['Profit'].sum().reset_index()
fig=px.bar(p_sc, x="Sub-Category",y="Profit", title="Profit Analysis by Sub Category")
fig.show()

In the Copiers category the profit is maximum and in the Tables category the profit is minimum which is negative.

Q10 - Sales and Profit by Segment¶

In [21]:
sp_segment= df.groupby('Segment').agg({'Sales':sum, 'Profit':sum}).reset_index()
cp=colors.qualitative.Pastel
fig=go.Figure()
fig.add_trace(go.Bar(x=sp_segment['Segment'],
                     y=sp_segment['Sales'],
                     name='Sales', 
                     marker_color=cp[2]))
fig.add_trace(go.Bar(x=sp_segment['Segment'],
                     y=sp_segment['Profit'],
                     name='Profit', 
                     marker_color=cp[4]))
fig.update_layout(title='sales and profit analysis by segment',
                   xaxis_title='Customer Segment', yaxis_title='Amount')
fig.show()

The consumers provided the maximum profit in sales about 1.1M and Home officers provided the minimum profit about 0.4M.

Q11 - Sales to Profit Ratio¶

In [22]:
sp_segment = df.groupby('Segment').agg({'Sales': 'sum', 'Profit': 'sum'}).reset_index()
sp_segment['Sales_to_Profit_Ratio'] = sp_segment['Sales'] / sp_segment['Profit']
print(sp_segment[['Segment', 'Sales_to_Profit_Ratio']])
       Segment  Sales_to_Profit_Ratio
0     Consumer               8.659471
1    Corporate               7.677245
2  Home Office               7.125416

The consumer Sales to profit ratio is maximum.

Q12 - 3-Month Moving Average of Sales¶

In [28]:
df_sorted = df.sort_values('Order Date')
df_sorted['Sales_MA_3'] = df_sorted['Sales'].rolling(window=3).mean()
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_sorted['Order Date'], y=df_sorted['Sales'], mode='lines', name='Actual Sales'))
fig.add_trace(go.Scatter(x=df_sorted['Order Date'], y=df_sorted['Sales_MA_3'], mode='lines', name='3-Month Moving Avg', line=dict(dash='dash')))
fig.update_layout(title='Sales with 3-Month Moving Average')
fig.show()

The 3 month monthly average is mostly less than the actual sales observed.

Q13 - Detect Outliers (Above 95th Percentile)¶

In [24]:
threshold = df['Sales'].quantile(0.95)
outliers = df[df['Sales'] > threshold]
fig = px.scatter(df, x='Order Date', y='Sales', title='Sales Outliers (Above 95th Percentile)', color=df['Sales'] > threshold)
fig.show()

According to the graph the Bulk order is observed on the date 18 March, 2014.

In [ ]: